Helpful resource for network visualization in R: https://mr.schochastics.net/

library(tidyverse)
library(readr)
library(gridExtra)
library(haven)
library(RColorBrewer)
library(igraph)
library(tidygraph) 
library(ggraph)
library(stringr)
library(kableExtra)
# reddit data
discussion_data <- read_csv("../data/anon/discussions_anon.csv")

# preprocessed survey data
sample <- read_csv("../data/anon/scaled_sample.csv")

# full data
full_data <- read_csv("../data/anon/full_data_waves.csv")

Comment Graphs conditions

subreddits <- c("DiscussPolitics1", "DiscussPolitics2", "DiscussPolitics3", 
                "DiscussPolitics4", "DiscussPolitics5", "DiscussPolitics6")

plots <- list()

for (i in seq_along(subreddits)) {
  subreddit_name <- subreddits[i]
  
  edge_list <- discussion_data %>%
    filter(subreddit == subreddit_name) %>%
    mutate(to = comment_id,
           from = parent_id) %>%
    select(from, to) %>%
    na.omit()
  
  tidy_g <- as_tbl_graph(edge_list, directed = TRUE)
  
  community_color <- case_when(
    i %in% c(1, 6) ~ "#70facb",
    i %in% c(2, 5) ~ "#6699FF",
    i %in% c(3, 4) ~ "#29c195"
  )
  
  comment_graph <- ggraph(tidy_g, "tree") +
    geom_edge_link0(alpha = 0.2) +
    geom_node_point(color = community_color) +
    theme_void() 
  
  plots[[i]] <- comment_graph
  
  ggsave(filename = paste0("../output/", subreddit_name, "_comment_graph_cond.pdf"),
         plot = comment_graph, width = 10, height = 10)
}

combi <- grid.arrange(plots[[1]], plots[[2]], plots[[3]], ncol = 3)

Toxicity comment graphs

subreddits <- c("DiscussPolitics1", "DiscussPolitics2", "DiscussPolitics3", 
                "DiscussPolitics4", "DiscussPolitics5", "DiscussPolitics6")

plots <- list()

for (i in seq_along(subreddits)) {
  subreddit_name <- subreddits[i]
  
  edge_list <- discussion_data %>%
    filter(subreddit == subreddit_name) %>%
    mutate(to = comment_id,
           from = parent_id) %>%
    select(from, to) %>%
    na.omit()
  
  tidy_g <- as_tbl_graph(edge_list, directed = TRUE)
  
  note_attributes <- discussion_data %>%
    mutate(name = comment_id,
           type = ifelse(str_detect(parent_id, "t3_"), "post", "comment")) %>%
    select(name, comment_toxicity, length_comment_char, type)
  
  tidy_g <- tidy_g %>%
    left_join(., note_attributes, by = "name")
  
  comment_graph <- ggraph(tidy_g, "tree") +
    geom_edge_link0(alpha = 0.2) +
    geom_node_point(shape = 21, aes(fill = comment_toxicity, size = length_comment_char), alpha = 0.8) +
    guides(alpha = "none", edge_alpha = "none",
           fill = guide_legend(title = "Toxicity"),
           size = guide_legend(title = "Length")) +
    scale_fill_gradient(low = "#6699ff", high = "#bc3455", limits = c(0, 0.6)) +
    scale_size(range = c(2, 6)) +
    theme_void() +
    ggtitle(subreddit_name)
  
  plots[[i]] <- comment_graph
}

combined_plot <- grid.arrange(grobs = plots, ncol = 2, nrow = 3)

ggsave("../output/comment_graphs_combined.pdf", combined_plot, width = 20, height = 20)

User Interaction Network

plots <- list()

for (i in seq_along(subreddits)) {
  subreddit_name <- subreddits[i]
  
  user_edge_list <- discussion_data %>%
    filter(subreddit == subreddit_name) %>%
    #  'from' as the user who made the parent comment and 'to' as the user who replied
    mutate(to = ParticipantID, 
           from = discussion_data$ParticipantID[match(parent_id, discussion_data$comment_id)]) %>%
    select(from, to) %>%
    na.omit()  # remove NA where parent_id does not match
  
  # get all users who participated in the discussion (even without replies)
  all_users <- discussion_data %>%
    filter(subreddit == subreddit_name) %>%
    select(ParticipantID) %>%
    distinct() %>%
    rename(name = ParticipantID)
  
  user_tidy_g <- as_tbl_graph(user_edge_list, directed = TRUE) %>%
    activate(nodes) %>%
    # add all users to ensure isolated nodes are included
    full_join(all_users, by = "name")
  
  user_attributes <- discussion_data %>%
    group_by(ParticipantID) %>%
    summarize(comment_count = n(),
              avg_toxicity = mean(comment_toxicity, na.rm = TRUE)) %>%
    mutate(name = ParticipantID)  
  
  user_tidy_g <- user_tidy_g %>%
    left_join(., user_attributes, by = "name")
  
  user_graph <- ggraph(user_tidy_g, layout = "fr") +  # Fruchterman-Reingold layout
    geom_edge_link0(alpha = 0.2) +
    geom_node_point(shape = 21, aes(fill = avg_toxicity, size = comment_count),alpha = 0.8) +
    guides(alpha = "none", edge_alpha = "none",
           fill = guide_legend(title = "Avg Toxicity"),
           size = guide_legend(title = "Comment Count")) +
    scale_fill_gradient(low = "#6699ff", high = "#bc3455", limits = c(0, 0.5)) +
    scale_size(range = c(2, 6)) +
    theme_void() +
    ggtitle(paste("User Interaction Network -", subreddit_name))
  
  plots[[i]] <- user_graph
}

combined_plot <- grid.arrange(grobs = plots, ncol = 2, nrow = 3)

ggsave("../output/user_interaction_graphs_combined.pdf", combined_plot, width = 20, height = 20)

Degree distributions

degree_plots <- list()

for (subreddit_name in subreddits) {
  
  user_edge_list <- discussion_data %>%
    filter(subreddit == subreddit_name) %>%
    mutate(to = ParticipantID, 
           from = discussion_data$ParticipantID[match(parent_id, discussion_data$comment_id)]) %>%
    select(from, to) %>%
    na.omit()  # Remove rows where parent_id does not match
  
  user_tidy_g <- as_tbl_graph(user_edge_list, directed = TRUE)
  
  degree_data <- user_tidy_g %>%
    mutate(in_degree = centrality_degree(mode = "in"),
           out_degree = centrality_degree(mode = "out")) %>%
    as_tibble()
  
  degree_plot <- ggplot(degree_data, aes(x = in_degree, fill = "In-Degree")) +
    geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
    geom_histogram(aes(x = out_degree, fill = "Out-Degree"), alpha = 0.6, position = "identity", bins = 30) +
    scale_fill_manual(values = c("In-Degree" = "#6699ff", "Out-Degree" = "#bc3455")) +
    labs(title = paste("Degree Centrality Distribution -", subreddit_name),
         x = "Degree Centrality",
         y = "Count",
         fill = "Degree Type") +
    theme_bw()
  
  degree_plots[[subreddit_name]] <- degree_plot
}

combined_plot <- grid.arrange(grobs = degree_plots, nrow = 3, ncol = 2)

ggsave("../output/combined_degree_centrality_distribution.pdf", combined_plot, width = 15, height = 15)

Differences in degree distributions?

Kolmogorov-Smirnov (KS) Test

non-parametric test that compares two distributions and determines whether they come from the same distribution.

Test Metrics: 1. D-statistic: Maximum difference between the empirical distributions (cumulative distribution functions). 2. P-value: Probability of observing the D-statistic under the null hypothesis that the two distributions are the same. 3. ECDFs: Cumulative probability of the data points in each sample.

# Define subreddit conditions
control_subreddits <- c("DiscussPolitics1", "DiscussPolitics6")
moderation_subreddits <- c("DiscussPolitics2", "DiscussPolitics5")
incentives_subreddits <- c("DiscussPolitics3", "DiscussPolitics4")

combined_degree_data <- data.frame()

for (subreddit_name in subreddits) {
  
  user_edge_list <- discussion_data %>%
    filter(subreddit == subreddit_name) %>%
    mutate(to = ParticipantID, 
           from = discussion_data$ParticipantID[match(parent_id, discussion_data$comment_id)]) %>%
    select(from, to) %>%
    na.omit()
  
  user_tidy_g <- as_tbl_graph(user_edge_list, directed = TRUE)
  
  degree_data <- user_tidy_g %>%
    mutate(in_degree = centrality_degree(mode = "in"),
           out_degree = centrality_degree(mode = "out")) %>%
    as_tibble() %>%
    mutate(subreddit = subreddit_name,
           condition = case_when(
             subreddit_name %in% control_subreddits ~ "control",
             subreddit_name %in% moderation_subreddits ~ "moderation",
             subreddit_name %in% incentives_subreddits ~ "incentives"
           ))  
  
  combined_degree_data <- rbind(combined_degree_data, degree_data)
}

#  KS test for pairwise comparisons between conditions
perform_ks_test <- function(degree_data, centrality_type) {
  control_data <- degree_data %>% filter(condition == "control") %>% pull(!!centrality_type)
  moderation_data <- degree_data %>% filter(condition == "moderation") %>% pull(!!centrality_type)
  incentives_data <- degree_data %>% filter(condition == "incentives") %>% pull(!!centrality_type)
  
  ks_tests <- list(
    control_vs_moderation = ks.test(control_data, moderation_data),
    control_vs_incentives = ks.test(control_data, incentives_data),
    moderation_vs_incentives = ks.test(moderation_data, incentives_data)
  )
  
  results <- lapply(ks_tests, function(test) {
    data.frame(
      Comparison = names(test),
      D_statistic = test$statistic,
      P_value = round(test$p.value, 4),
      Centrality_Type = centrality_type
    )
  })
  
  do.call(rbind, results)
}

ks_in_degree <- perform_ks_test(combined_degree_data, "in_degree")
ks_out_degree <- perform_ks_test(combined_degree_data, "out_degree")

ks_results <- rbind(ks_in_degree, ks_out_degree)%>%
  select(-Comparison) %>%
  distinct()  

html_table <- ks_results %>%
  kbl(caption = "Pairwise KS Test Results for Degree Centrality Distributions") %>%
  kable_styling(full_width = F, bootstrap_options = c( "hover", "condensed"))

html_table
Pairwise KS Test Results for Degree Centrality Distributions
D_statistic P_value Centrality_Type
control_vs_moderation.1 0.0943396 0.7611 in_degree
control_vs_incentives.1 0.1003080 0.6848 in_degree
moderation_vs_incentives.1 0.0973639 0.5623 in_degree
control_vs_moderation.11 0.0536557 0.9987 out_degree
control_vs_incentives.11 0.0889488 0.8152 out_degree
moderation_vs_incentives.11 0.0946003 0.5689 out_degree
save_kable(html_table, file = "../output/ks_test_results.html")

Degree centrality and toxicity

combined_degree_data <- combined_degree_data%>%
  mutate(ParticipantID = name)

data <- full_data%>%
  left_join(., combined_degree_data, by = "ParticipantID")

indegree <- ggplot(data, aes(in_degree, comment_mean_tox, color = comment_mean_tox))+
  geom_point(aes(fill = comment_mean_tox, size = 2, alpha = 0.5 ))+
  scale_color_gradient(low = "#6699ff", high = "#bc3455", limits = c(0, 0.6)) +
  theme_bw()+
  geom_smooth(method = loess, alpha = 0.1, color = "darkgrey")+
  ylab("User level average comment toxicity")+
  xlab("User reply network in-degree")+
  ggtitle("Toxicity by In-Degree")+
  guides(size = "none", alpha = "none", fill = "none", color = "none")

outdegree <- ggplot(data, aes(out_degree, comment_mean_tox, color = comment_mean_tox))+
  geom_point(aes(fill = comment_mean_tox, size = 2, alpha = 0.5 ))+
  scale_color_gradient(low = "#6699ff", high = "#bc3455", limits = c(0, 0.6)) +
  theme_bw()+
  geom_smooth(method = loess, alpha = 0.1, color = "darkgrey")+
  ylab("User level average comment toxicity")+
  xlab("User reply network out-degree")+
  ggtitle("Toxicity by Out-Degree")+
  guides(size = "none", alpha = "none", fill = "none",
         color = guide_legend(title="Toxicity"))

degrees <- grid.arrange(indegree, outdegree, nrow = 1, widths = c(4,5))

ggsave("../output/toxicity_degree.pdf", plot = degrees, width = 10, height = 5)

Modeling Drop-out with Survival Models

library(survival)
library(survminer)

# observation period's end (e.g., 30 days after start)
end_of_observation <- as.Date(min(discussion_data$created_comment)) + 30

# Prepare the survival data
survival_data <- discussion_data %>%
  group_by(ParticipantID) %>%
  summarize(discussion_start = min(created_comment),
    last_comment_date = max(created_comment),
    time = as.numeric(pmin(last_comment_date, end_of_observation) - discussion_start),
    status = ifelse(last_comment_date <= end_of_observation, 1, 0))%>%
  left_join(.,sample,by = "ParticipantID")

# h(t) hazard's function: risk of dying at time t, given the covariates. Covariates > 0 increase in hazard, # covariates < 0 decrease in hazard / negative predictor for dropout.    
res.cox <- coxph(Surv(time,status) ~ condition + group_toxicity, data = survival_data) 
summary(res.cox)
## Call:
## coxph(formula = Surv(time, status) ~ condition + group_toxicity, 
##     data = survival_data)
## 
##   n= 331, number of events= 324 
##    (4 observations deleted due to missingness)
## 
##                        coef exp(coef) se(coef)      z Pr(>|z|)   
## conditionincentives -0.4491    0.6382   0.1370 -3.277  0.00105 **
## conditionmoderation -0.3222    0.7246   0.1564 -2.060  0.03940 * 
## group_toxicity       0.3088    1.3618   0.1335  2.314  0.02070 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## conditionincentives    0.6382     1.5669    0.4879    0.8349
## conditionmoderation    0.7246     1.3801    0.5333    0.9845
## group_toxicity         1.3618     0.7343    1.0483    1.7691
## 
## Concordance= 0.538  (se = 0.019 )
## Likelihood ratio test= 14.01  on 3 df,   p=0.003
## Wald test            = 14.06  on 3 df,   p=0.003
## Score (logrank) test = 14.23  on 3 df,   p=0.003
#Kaplan-Meier survival curves for different groups
surv_fit <- survfit(Surv(time, status) ~ condition, data = survival_data)

ggsurvplot(surv_fit,
  data = survival_data,
  ggtheme = theme_bw(),
  palette = "Dark2", 
  conf.int = TRUE,  
  legend.title = "Experimental Condition", # Customize legend title
  legend.labs = c("Control", "Moderation", "Incentives"), 
  xlab = "Days",  
  xlim = c(0, 30) ,
  ylab = "Probability for Continued Participation")

# split along toxicity
toxicity_threshold <- quantile(survival_data$group_toxicity, 0.8, na.rm = TRUE)

survival_data <- survival_data %>%
  mutate(toxicity_group = ifelse(group_toxicity > toxicity_threshold, "High", "Low"))

surv_fit <- survfit(Surv(time, status) ~ toxicity_group, data = survival_data)

ggsurvplot(
  surv_fit,
  data = survival_data,
  ggtheme = theme_bw(),
  palette = c("#E7B800", "#2E9FDF"), # Distinct colors for High and Low
  conf.int = TRUE,   # Add confidence intervals
  legend.title = "Toxicity Group", # Customize legend title
  legend.labs = c("High Toxicity", "Low Toxicity"), # Rename groups
  xlab = "Days",     # Label for x-axis
  ylab = "Survival Probability", # Label for y-axis
  xlim = c(0, 30)    # Limit x-axis range
)